library(tidyverse)
# library(dict) # Still not found after installation
library(container) # For Dict class
library(useful) # For simple.impute
library(comprehenr) # For list comprehension
library(GGally)
library(reshape2)
library(gridExtra)
library(gplots)
library(DescTools) # For df summary
library(robustHD) # For df summary
library(caret)
library(effsize) # For Cohen's d
source('tools/wrangle.R')
source('tools/eda.R')
source('tools/engineer.R')
source('tools/split.R')
# SEED = 65466
Here’s the wrangled training set loaded from objects serialized at the end of wrangle_and_split.Rmd.
# Load the wrangled training split serialized at the end of wrangle_and_split.Rmd,
# keep the row identifiers aside, and drop Id from the modeling frame.
val_train_Xy <- readRDS("data/eda_pt1_val_train_Xy.rds")
id_col <- val_train_Xy[['Id']]
val_train_Xy <- val_train_Xy %>% select(-Id)
# Candidate normalizing transformations, keyed by display name. Each function
# is applied column-wise by find_best_normalizer_per_feat(), which picks the
# transformation producing the most normal distribution per feature.
funcs_lst = list(
'no_func' = function (x) { x },
'sqrt' = sqrt,
'cbrt' = function(x) { x^(1/3) },
'square' = function(x) { x^2 },
###
# Log transformations of x+1 (resolves the FIXME): these stay finite at
# x == 0, unlike the plain log family below. log1p(x) computes log(x + 1)
# with better accuracy for small x.
###
'log(x+1)' = log1p,
'log2(x+1)' = function (x) { log2(x + 1) },
'log10(x+1)' = function (x) { log10(x + 1) },
'log' = log,
'log2' = log2,
'log10' = log10,
'1/x' = function (x) { 1/x },
'2^(1/x)' = function (x) { 2^(1/x) }
# Box Cox: write function that calls MASS::boxcox and include lambda in results/function returned.
# Yeo-Johnson
# Winsorize here?
# Rank
# Rank-Gauss
)
# All numeric columns are candidates for normalization.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
# find_best_normalizer_per_feat() (defined in tools/engineer.R — contract
# assumed from usage; confirm there) applies each funcs_lst entry to each
# feature and records the best-normalizing one. Zeros are excluded so
# log/reciprocal transforms stay finite.
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
# Optional per-feature report of the winning transformation (disabled).
# print("Best normalizing transformations:")
# for (feat in names(best_normalizers)) {
# func_name = best_normalizers[[feat]]$best_func$name
# print(
# paste(
# feat, ":", func_name,
# ", p-value:", best_normalizers[[feat]]$results[[func_name]]$p.value
# )
# )
# }
Mostly gas forced air (GasA, 702). Can probably drop this.
# Heating is near-constant (almost all GasA), so inspect and drop it.
summary(val_train_Xy[['Heating']])
## None Floor GasA GasW Grav OthW Wall
## 0 0 702 7 2 1 3
val_train_Xy <- val_train_Xy %>% select(-Heating)
Mostly excellent (360), then interestingly more average (211) than good (117). Enough variance to keep, but not normal.
# Summarize SalePrice by HeatingQC level, then test pairwise differences in
# log(SalePrice) between levels (helpers sourced from tools/eda.R).
x = 'HeatingQC'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
# min_n = 29 drops sparse levels from the pairwise tests.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
# Annotated heatmap of the pairwise p-value matrix; clustering disabled.
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Ex" "TA" "Gd"
676 yes, 39 no. Probably drop this one.
# CentralAir is heavily imbalanced (mostly Y), so inspect and drop it.
summary(val_train_Xy[['CentralAir']])
## N Y
## 39 676
val_train_Xy <- val_train_Xy %>% select(-CentralAir)
650 standard circuit breaker and Romex. Probably drop, but there is only one mixed, so it might be worth making it an ordered factor if keeping.
# Electrical is dominated by SBrkr; inspect the distribution and drop it.
summary(val_train_Xy[['Electrical']])
## None SBrkr FuseA FuseF FuseP Mix
## 0 650 49 14 1 1
val_train_Xy <- val_train_Xy %>% select(-Electrical)
Dropping as component of GrLivArea.
# X1stFlrSF is a component of GrLivArea; inspect and drop to avoid redundancy.
summary(val_train_Xy[['X1stFlrSF']])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 881 1086 1167 1392 4692
val_train_Xy <- val_train_Xy %>% select(-X1stFlrSF)
Dropping as component of GrLivArea. The presence of a second floor is encoded in HouseStyle, but it’s also convoluted between a few levels. I’ll make a binary out of it and keep that.
# Encode presence of a second floor as a 0/1 indicator (plus a temporary
# ordered-factor version for the level-wise significance helper).
val_train_Xy = val_train_Xy %>%
mutate('X2ndFlr.bin' = ifelse(X2ndFlrSF <= 0, 0, 1)) %>%
mutate('X2ndFlr.bin.fact' = factor(X2ndFlr.bin, ordered = T))
x = 'X2ndFlr.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
x = 'X2ndFlr.bin.fact'
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
# Annotated heatmap of pairwise p-values between the two levels.
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"
# Keep the numeric indicator; the factor version was only needed above.
val_train_Xy = select(val_train_Xy, -c('X2ndFlr.bin.fact'))
702 0s. Drop this feature, though it would be a useful modifier of GrLivArea as a detracting component of it.
# LowQualFinSF is almost all zeros; inspect and drop it.
summary(val_train_Xy[['LowQualFinSF']])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 4.846 0.000 528.000
val_train_Xy <- val_train_Xy %>% select(-LowQualFinSF)
As this is polymodal to the number of floors, I’ll start by faceting before transforming.
# GrLivArea is polymodal by storey count, so facet histograms by the
# second-floor indicator before and after each transformation.
ggplot(val_train_Xy, aes(x = GrLivArea)) +
geom_histogram(binwidth = 50) +
facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)
x = 'GrLivArea'
summary(val_train_Xy[x])
## GrLivArea
## Min. : 334
## 1st Qu.:1129
## Median :1456
## Mean :1515
## 3rd Qu.:1791
## Max. :5642
# Apply the best normalizer found for GrLivArea (helper from tools/eda.R).
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = .1
)
## NULL
# First transformation: log2(GrLivArea).
x = 'log2(GrLivArea)'
val_train_Xy = val_train_Xy %>%
mutate('log2(GrLivArea)' = log2(GrLivArea))
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = .1) +
facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## log2(GrLivArea)
## Min. : 8.384
## 1st Qu.:10.141
## Median :10.508
## Mean :10.486
## 3rd Qu.:10.807
## Max. :12.462
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = 1
)
## NULL
# Second transformation: square of the log2 (chained).
x = 'square(log2(GrLivArea))'
val_train_Xy = val_train_Xy %>%
mutate('square(log2(GrLivArea))' = log2(GrLivArea)^2)
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = 1) +
facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## square(log2(GrLivArea))
## Min. : 70.29
## 1st Qu.:102.84
## Median :110.41
## Mean :110.18
## 3rd Qu.:116.78
## Max. :155.30
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
# Check residual polymodality: facet by storeys, then by storeys x build era.
ggplot(val_train_Xy, aes(x = `square(log2(GrLivArea))`)) +
geom_histogram(binwidth = 1) +
facet_wrap(facets = vars(.data$X2ndFlr.bin), ncol = 1)
ggplot(val_train_Xy, aes(x = `square(log2(GrLivArea))`)) +
geom_histogram(binwidth = 1) +
facet_grid(
cols = vars(.data$X2ndFlr.bin),
rows = vars(.data$YearBuilt.fact)
)
It looks like there might be more polymodality, particularly among mid-century builds, but I’ll leave it here. The transformations seem to have better normalized both the full variable and its subsets by storey and age. Winsorization should help both too.
# Compare normality of raw, log2, and squared-log2 GrLivArea, with and
# without Winsorization, via QQ plots and Shapiro-Wilk tests.
x = 'square(log2(GrLivArea))'
qqnorm(y = val_train_Xy$GrLivArea, ylab = 'GrLivArea')
qqline(y = val_train_Xy$GrLivArea, ylab = 'GrLivArea')
qqnorm(y = val_train_Xy$`log2(GrLivArea)`, ylab = 'log2(GrLivArea)')
qqline(y = val_train_Xy$`log2(GrLivArea)`, ylab = 'log2(GrLivArea)')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
# Clip both tails at the 0.2% quantiles of the squared-log2 variable.
Win_log2_x_squared = Winsorize(
x = val_train_Xy[[x]],
probs = c(0.002, 0.998),
na.rm = T
)
qqnorm(y = Win_log2_x_squared, ylab = 'Win_log2_x_squared')
qqline(y = Win_log2_x_squared, ylab = 'Win_log2_x_squared')
Win_log2_x = Winsorize(
x = val_train_Xy$`log2(GrLivArea)`,
probs = c(0.002, 0.998),
na.rm = T
)
qqnorm(y = Win_log2_x, ylab = 'Win_log2_x')
qqline(y = Win_log2_x, ylab = 'Win_log2_x')
# The raw variable needs much heavier top-coding (95th percentile).
Win_raw_x = Winsorize(
x = val_train_Xy$GrLivArea,
probs = c(0, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$GrLivArea))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$GrLivArea
## W = 0.92124, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log2(GrLivArea)`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`log2(GrLivArea)`
## W = 0.99301, p-value = 0.002004
print(shapiro.test(x = val_train_Xy[['square(log2(GrLivArea))']]))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy[["square(log2(GrLivArea))"]]
## W = 0.99284, p-value = 0.001655
print(shapiro.test(x = Win_log2_x_squared))
##
## Shapiro-Wilk normality test
##
## data: Win_log2_x_squared
## W = 0.99603, p-value = 0.06738
print(shapiro.test(x = Win_log2_x))
##
## Shapiro-Wilk normality test
##
## data: Win_log2_x
## W = 0.99606, p-value = 0.06957
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.97112, p-value = 1.136e-10
Winsorizing the log2 better normalized than squaring it, but Winsorizing the squared log2 is close, too. May be overfit to add the square.
# Materialize both Winsorized candidates as columns. The ifelse guards
# against non-positive GrLivArea before taking log2 (maps them to 0).
val_train_Xy = val_train_Xy %>%
mutate(
'Win(log2(GrLivArea))' = Winsorize(
ifelse(GrLivArea <= 0, 0, log2(GrLivArea)),
probs = c(0.002, 0.998),
na.rm = T
)
) %>%
mutate(
'Win(square(log2(GrLivArea)))' = Winsorize(
ifelse(GrLivArea <= 0, 0, log2(GrLivArea)^2),
probs = c(0.002, 0.998),
na.rm = T
)
)
# Correlate each candidate against all numeric features. Note: x here is
# still 'square(log2(GrLivArea))' from the previous chunk.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GrLivArea', 'log2(GrLivArea)', 'square(log2(GrLivArea))', 'Win(log2(GrLivArea))', 'Win(square(log2(GrLivArea)))')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# NOTE(review): df is already abs()'d on the line above, so the inner abs()
# here is redundant (harmless).
summary(abs(df))
## GrLivArea log2(GrLivArea) square(log2(GrLivArea))
## Min. :0.004074 Min. :0.007031 Min. :0.005632
## 1st Qu.:0.170723 1st Qu.:0.138405 1st Qu.:0.144044
## Median :0.300589 Median :0.308047 Median :0.308343
## Mean :0.320766 Mean :0.319385 Mean :0.320768
## 3rd Qu.:0.444557 3rd Qu.:0.439274 3rd Qu.:0.442560
## Max. :0.829900 Max. :0.826904 Max. :0.831350
## Win(log2(GrLivArea)) Win(square(log2(GrLivArea)))
## Min. :0.007115 Min. :0.005648
## 1st Qu.:0.127613 1st Qu.:0.128787
## Median :0.306231 Median :0.304675
## Mean :0.317085 Mean :0.318136
## 3rd Qu.:0.428208 3rd Qu.:0.429922
## Max. :0.827504 Max. :0.831948
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
The Winsorized double transformation may be slightly overfitting and overcomputing, but it’s slightly (likely insignificantly) more normal and correlated to SalePrice than the Winsorized single transformation. I’ll go with it despite common sense, just for fun.
# Rebuild the chosen feature with FIXED Winsorization bounds taken from the
# training data, so the same clip values can be reused on validation/test
# data instead of re-estimating quantiles.
x = 'Win(square(log2(GrLivArea)))'
min_val = min(Win_log2_x_squared)
max_val = max(Win_log2_x_squared)
print(paste("min_val:", min_val))
## [1] "min_val: 78.8827556705776"
print(paste("max_val:", max_val))
## [1] "max_val: 143.236548514549"
val_train_Xy = val_train_Xy %>%
mutate(
# NOTE(review): minval/maxval are the older DescTools::Winsorize
# arguments; newer releases use val = c(min, max) — confirm the
# installed DescTools version.
'Win(square(log2(GrLivArea)))' = Winsorize(
ifelse(GrLivArea <= 0, 0, log2(GrLivArea)^2),
# probs = c(0.002, 0.998),
# na.rm = T
minval = min_val,
maxval = max_val
)
) %>%
select(-c('log2(GrLivArea)', 'Win(log2(GrLivArea))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
I could make two features, total baths above grade and total basement baths, but I’ll keep it simple.
# Combine the four bathroom counts into one feature; half baths count 0.5.
val_train_Xy = val_train_Xy %>%
mutate(TotBaths = FullBath + BsmtFullBath + 0.5*HalfBath + 0.5*BsmtHalfBath)
x = 'TotBaths'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
# Jitter + box + violin per bath count, colored by basement-bath count,
# with mean and 10th/90th-percentile trend lines overlaid.
ggplot(val_train_Xy, aes(x = factor(TotBaths), y = log(SalePrice))
) +
geom_jitter(alpha = 0.5, aes(color = factor(BsmtFullBath + BsmtHalfBath))) +
geom_boxplot(
notch = T,
notchwidth = .1,
varwidth = T,
alpha = 0,
color = 'blue'
) +
geom_violin(alpha = 0) +
geom_line(
stat = 'summary',
fun = quantile,
fun.args = list(probs = .9),
linetype = 2, aes(group = 1)
) +
geom_line(stat = 'summary', fun = mean, mapping = aes(group = 1)) +
geom_line(
stat = 'summary',
fun = quantile,
fun.args = list(probs = .1),
linetype = 2, aes(group = 1)
) +
xlab(label = 'TotBaths') + ylab(label = 'log(SalePrice)')
Basement bathrooms don’t seem to add much value when there aren’t many bathrooms altogether.
# Pairwise level significance for TotBaths. NOTE(review): min_n = 29 here,
# but 30 elsewhere in this file — confirm the inconsistency is intentional.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "3.5" "2" "3" "1" "1.5" "2.5"
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## TotBaths
## Min. :1.000
## 1st Qu.:2.000
## Median :2.000
## Mean :2.217
## 3rd Qu.:2.500
## Max. :6.000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.5,
t_binw = 0.1
)
## NULL
# Candidate transformation: sqrt(TotBaths).
x = 'sqrt(TotBaths)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(TotBaths)' = sqrt(TotBaths))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## sqrt(TotBaths)
## Min. :1.000
## 1st Qu.:1.414
## Median :1.414
## Mean :1.464
## 3rd Qu.:1.581
## Max. :2.449
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
## NULL
# QQ plots and Shapiro-Wilk tests for raw vs sqrt, with/without Winsorizing.
x = 'sqrt(TotBaths)'
qqnorm(y = val_train_Xy$TotBaths, ylab = 'TotBaths')
qqline(y = val_train_Xy$TotBaths, ylab = 'TotBaths')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_sqrt_x = Winsorize(
x = val_train_Xy[[x]],
probs = c(0.0, 0.999),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
x = val_train_Xy$TotBaths,
probs = c(0, 0.998),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$TotBaths))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$TotBaths
## W = 0.92874, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`sqrt(TotBaths)`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`sqrt(TotBaths)`
## W = 0.9241, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_sqrt_x
## W = 0.92391, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.93136, p-value < 2.2e-16
Maybe just lightly top-coding the raw variable is best.
# Materialize both Winsorized candidates, compare correlations, then keep
# only the lightly top-coded raw variable with fixed bounds.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(sqrt(TotBaths))' = Winsorize(
sqrt(TotBaths),
probs = c(0, 0.999),
na.rm = T
)
) %>%
mutate(
'Win(TotBaths)' = Winsorize(
TotBaths,
probs = c(0, 0.998),
na.rm = T
)
)
x = 'Win(TotBaths)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotBaths', 'sqrt(TotBaths)', 'Win(sqrt(TotBaths))', 'Win(TotBaths)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## TotBaths sqrt(TotBaths) Win(sqrt(TotBaths)) Win(TotBaths)
## Min. :0.004047 Min. :0.001146 Min. :0.0004721 Min. :0.002175
## 1st Qu.:0.157148 1st Qu.:0.155984 1st Qu.:0.1568243 1st Qu.:0.160011
## Median :0.329253 Median :0.331356 Median :0.3319365 Median :0.331592
## Mean :0.329428 Mean :0.328623 Mean :0.3294207 Mean :0.332448
## 3rd Qu.:0.487119 3rd Qu.:0.483214 3rd Qu.:0.4844607 3rd Qu.:0.493532
## Max. :0.687363 Max. :0.687107 Max. :0.6868680 Max. :0.688339
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
# Rebuild with fixed clip bounds from the training data (reusable on
# validation/test splits).
x = 'Win(TotBaths)'
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
## [1] "min_val: 1"
print(paste("max_val:", max_val))
## [1] "max_val: 4.786"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(TotBaths)' = Winsorize(
FullBath + BsmtFullBath + 0.5*HalfBath + 0.5*BsmtHalfBath,
# probs = c(0.002, 0.998),
# na.rm = T
minval = min_val,
maxval = max_val
)
) %>%
select(-c('sqrt(TotBaths)', 'Win(sqrt(TotBaths))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 0.5)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
# BedroomAbvGr: level-wise significance, correlations, then transformation
# search (sqrt) and Winsorization comparison.
x = 'BedroomAbvGr'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
# Work on a local copy with a factor version for the level-wise helpers.
df = select(val_train_Xy, c('log(SalePrice)', 'BedroomAbvGr'))
df$BedroomAbvGr.fact = factor(df$BedroomAbvGr)
sum_and_trans_fact(data = df, x = 'BedroomAbvGr.fact', y = y)
## NULL
p_vals = get_signif_levels(
data = df,
x = 'BedroomAbvGr.fact',
z = y,
min_n = 30
)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "3" "2" "4"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BedroomAbvGr')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## BedroomAbvGr
## Min. :0.004635
## 1st Qu.:0.041301
## Median :0.090023
## Mean :0.151626
## 3rd Qu.:0.209621
## Max. :0.654507
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.5,
t_binw = 0.1
)
## NULL
# Candidate transformation: sqrt(BedroomAbvGr).
x = 'sqrt(BedroomAbvGr)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(BedroomAbvGr)' = sqrt(BedroomAbvGr))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## sqrt(BedroomAbvGr)
## Min. :1.000
## 1st Qu.:1.414
## Median :1.732
## Mean :1.682
## 3rd Qu.:1.732
## Max. :2.828
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
## NULL
# QQ plots and Shapiro-Wilk tests, with/without Winsorization.
x = 'sqrt(BedroomAbvGr)'
qqnorm(y = val_train_Xy$BedroomAbvGr, ylab = 'BedroomAbvGr')
qqline(y = val_train_Xy$BedroomAbvGr, ylab = 'BedroomAbvGr')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_sqrt_x = Winsorize(
x = val_train_Xy[[x]],
probs = c(0, 0.999),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
x = val_train_Xy$BedroomAbvGr,
probs = c(0, 0.995),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$BedroomAbvGr))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$BedroomAbvGr
## W = 0.8457, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`sqrt(BedroomAbvGr)`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`sqrt(BedroomAbvGr)`
## W = 0.84731, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_sqrt_x
## W = 0.8486, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.8574, p-value < 2.2e-16
Maybe just lightly Winsorizing the raw variable is best.
# Materialize both Winsorized candidates, compare correlations, then keep
# only the lightly Winsorized raw variable with fixed bounds.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(sqrt(BedroomAbvGr))' = Winsorize(
sqrt(BedroomAbvGr),
probs = c(0, 0.999),
na.rm = T
)
) %>%
mutate(
'Win(BedroomAbvGr)' = Winsorize(
BedroomAbvGr,
probs = c(0, 0.995),
na.rm = T
)
)
x = 'Win(BedroomAbvGr)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BedroomAbvGr', 'sqrt(BedroomAbvGr)', 'Win(sqrt(BedroomAbvGr))', 'Win(BedroomAbvGr)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## BedroomAbvGr sqrt(BedroomAbvGr) Win(sqrt(BedroomAbvGr))
## Min. :0.004635 Min. :0.001827 Min. :0.002047
## 1st Qu.:0.041301 1st Qu.:0.044497 1st Qu.:0.044839
## Median :0.090023 Median :0.094369 Median :0.094250
## Mean :0.151626 Mean :0.150984 Mean :0.151281
## 3rd Qu.:0.209621 3rd Qu.:0.216792 3rd Qu.:0.217943
## Max. :0.654507 Max. :0.638464 Max. :0.635445
## Win(BedroomAbvGr)
## Min. :0.003767
## 1st Qu.:0.041751
## Median :0.091144
## Mean :0.154296
## 3rd Qu.:0.219703
## Max. :0.649031
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
# Rebuild with fixed clip bounds from the training data.
x = 'Win(BedroomAbvGr)'
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
## [1] "min_val: 1"
print(paste("max_val:", max_val))
## [1] "max_val: 5.42999999999995"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(BedroomAbvGr)' = Winsorize(
BedroomAbvGr,
# probs = c(0.002, 0.998),
# na.rm = T
minval = min_val,
maxval = max_val
)
) %>%
select(-c('sqrt(BedroomAbvGr)', 'Win(sqrt(BedroomAbvGr))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
# KitchenAbvGr is nearly constant at 1; inspect and drop it.
summary(val_train_Xy[['KitchenAbvGr']])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 1.041 1.000 3.000
val_train_Xy <- val_train_Xy %>% select(-KitchenAbvGr)
# Summarize SalePrice by KitchenQual level and test pairwise differences in
# log(SalePrice). NOTE(review): min_n = 29 vs 30 elsewhere — confirm intent.
x = 'KitchenQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gd" "TA" "Ex"
# TotRmsAbvGrd: level-wise significance on a factorized copy, then apply
# the previously found best normalizer.
x = 'TotRmsAbvGrd'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
df = select(val_train_Xy, c('log(SalePrice)', 'TotRmsAbvGrd'))
df$TotRmsAbvGrd.fact = factor(df$TotRmsAbvGrd)
sum_and_trans_fact(data = df, x = 'TotRmsAbvGrd.fact', y = y)
## NULL
p_vals = get_signif_levels(
data = df,
x = 'TotRmsAbvGrd.fact',
z = y,
min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "8" "5" "6" "4" "7" "9"
summary(val_train_Xy[x])
## TotRmsAbvGrd
## Min. : 2.000
## 1st Qu.: 5.000
## Median : 6.000
## Mean : 6.543
## 3rd Qu.: 7.000
## Max. :14.000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
# Correlations for the raw variable, then Winsorization (top-code at the
# 97.5th percentile), comparison, and final rebuild with fixed bounds.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotRmsAbvGrd')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## TotRmsAbvGrd
## Min. :0.00436
## 1st Qu.:0.10320
## Median :0.22718
## Mean :0.28644
## 3rd Qu.:0.40813
## Max. :0.83195
x = 'TotRmsAbvGrd'
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_raw_x = Winsorize(
x = val_train_Xy$TotRmsAbvGrd,
probs = c(0, 0.975),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(TotRmsAbvGrd)')
qqline(y = Win_raw_x, ylab = 'Win(TotRmsAbvGrd)')
print(shapiro.test(x = val_train_Xy$TotRmsAbvGrd))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$TotRmsAbvGrd
## W = 0.93616, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.94556, p-value = 1.508e-15
val_train_Xy = val_train_Xy %>%
mutate(
'Win(TotRmsAbvGrd)' = Winsorize(
TotRmsAbvGrd,
probs = c(0, 0.975),
na.rm = T
)
)
x = 'Win(TotRmsAbvGrd)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotRmsAbvGrd', 'Win(TotRmsAbvGrd)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## TotRmsAbvGrd Win(TotRmsAbvGrd)
## Min. :0.00436 Min. :0.006598
## 1st Qu.:0.10320 1st Qu.:0.092855
## Median :0.22718 Median :0.224392
## Mean :0.28644 Mean :0.286050
## 3rd Qu.:0.40813 3rd Qu.:0.414866
## Max. :0.83195 Max. :0.833412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
# Rebuild with fixed clip bounds from the training data.
x = 'Win(TotRmsAbvGrd)'
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
## [1] "min_val: 2"
print(paste("max_val:", max_val))
## [1] "max_val: 10.15"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(TotRmsAbvGrd)' = Winsorize(
TotRmsAbvGrd,
# probs = c(0, 0.975),
# na.rm = T
minval = min_val,
maxval = max_val
)
)
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
# Functional: level-wise comparison of log(SalePrice). No level pair turns
# out significantly different (character(0) below).
x = 'Functional'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
# Fireplaces: level-wise significance on an ordered-factor copy, then
# correlations for the raw count.
x = 'Fireplaces'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$Fireplaces.fact = factor(val_train_Xy$Fireplaces, ordered = T)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'Fireplaces.fact', y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'Fireplaces.fact', z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "2" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('Fireplaces')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## Fireplaces
## Min. :0.0001063
## 1st Qu.:0.1125161
## Median :0.2436102
## Mean :0.2242305
## 3rd Qu.:0.3210328
## Max. :0.4867665
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = c(y))
## NULL
The variable needs more than three unique values in order to use my function to check for transformations, and 0 can’t be one of them in order to include log transformations. So, I’ll use Fireplaces + 1 to search for transformations.
# Shift by +1 so zero counts survive log/reciprocal transforms during the
# normalizer search (see note above about the helper's requirements).
val_train_Xy$Fireplaces_plus1 = val_train_Xy$Fireplaces + 1
x = 'Fireplaces_plus1'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
# QQ and Shapiro-Wilk for the raw count, with and without Winsorization.
x = 'Fireplaces'
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_raw_x = Winsorize(
x = val_train_Xy$Fireplaces,
probs = c(0, 0.999),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$Fireplaces))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$Fireplaces
## W = 0.76773, p-value < 2.2e-16
# Identical W below: the 99.9% clip changes nothing for this count variable.
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.76773, p-value < 2.2e-16
Winsorization doesn’t help.
There’s no apparent meaningful difference in price between houses with 1 fireplace and those with 2 or 3. It’s significant (p < 0.1), but not apparently meaningful given the overlapping notches in the boxplot above, though I did not take Cohen’s d for effect size.
Binarization may condense the feature space without a lot of information loss. In fact, it improves the correlation to the target variable.
# Binarize Fireplaces: 0 = none, 1 = at least one.
val_train_Xy = val_train_Xy %>%
  mutate(Fireplaces.bin = ifelse(Fireplaces == 0, 0, 1))
x = 'Fireplaces.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$Fireplaces.bin.fact = factor(val_train_Xy$Fireplaces.bin, ordered = TRUE)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'Fireplaces.bin.fact', y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'Fireplaces.bin.fact', z = y, min_n = 30)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,  # FIX: TRUE/FALSE instead of T/F throughout this block.
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"
# Compare the raw count and the binary version on correlation to all
# other numeric features.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('Fireplaces', 'Fireplaces.bin')
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## Fireplaces Fireplaces.bin
## Min. :0.0001063 Min. :0.002832
## 1st Qu.:0.1163174 1st Qu.:0.138881
## Median :0.2436660 Median :0.220999
## Mean :0.2380836 Mean :0.237974
## 3rd Qu.:0.3215369 3rd Qu.:0.299826
## Max. :1.0000000 Max. :0.886894
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features')
# Drop the intermediate fireplace columns; keep only Fireplaces.bin.
val_train_Xy = select(
  val_train_Xy,
  -c('Fireplaces.fact', 'Fireplaces_plus1', 'Fireplaces.bin.fact',
     'Fireplaces')
)
# --- FireplaceQu: quality levels vs log price ---
x = 'FireplaceQu'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
# min_n = 29 so the 29-observation level is included in the tests.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "None" "TA" "Gd"
One-hot encoding FireplaceQu will include a binarized version (“None”:int[0,1]), so we can drop Fireplaces.bin altogether, and Fireplaces because it just adds noise. I’ll take care of that during modeling with caret.
# --- GarageType: category summary and level significance vs log price ---
x = 'GarageType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,  # FIX: FALSE instead of F (F is reassignable).
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Attchd" "Detchd" "None" "BuiltIn"
Similar distribution as year built for house. May not add additional information, but may be useful in interaction with type and finish. Could drop and use YearBuilt as proxy but would lose some info.
# --- GarageYrBlt: distribution and correlation to other numeric features ---
x = 'GarageYrBlt'
summary(val_train_Xy[x])
## GarageYrBlt
## Min. :1908
## 1st Qu.:1961
## Median :1979
## Mean :1978
## 3rd Qu.:2002
## Max. :2010
## NA's :40
# sum_and_trans_cont(
# data = val_train_Xy,
# x = x,
# func = best_normalizers[[x]]$best_func$func,
# func_name = best_normalizers[[x]]$best_func$name,
# x_binw = 1,
# t_binw = 1
# )
gg = ggplot(val_train_Xy, aes(x = GarageYrBlt))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = TRUE)  # FIX: TRUE instead of T.
grid.arrange(p1, p2)
x = 'GarageYrBlt'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## GarageYrBlt
## Min. :0.006671
## 1st Qu.:0.057040
## Median :0.177159
## Mean :0.235013
## 3rd Qu.:0.313315
## Max. :0.847412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
Even “controlling” for the year the house was built, GarageYrBlt doesn’t predict sale prices well, as seen in the next few plots. I’ll just drop it.
# Flag garages built after the house (NA GarageYrBlt yields NA here).
val_train_Xy = val_train_Xy %>%
mutate(
'GarBltLater' = factor(ifelse((GarageYrBlt - YearBuilt) <= 0, 0, 1))
)
# Visual check: garage year vs house year, colored by price bucket.
ggplot(val_train_Xy, aes(x = GarageYrBlt, y = YearBuilt)) +
geom_jitter(
alpha = 0.5,
aes(
color = SalePrice.fact
),
position = position_jitter(w = 1, h = 1)
)
fbv = fenced_jbv(
data = val_train_Xy,
x = 'GarBltLater',
y = 'log(SalePrice)'
)
fbv
fbv +
facet_wrap(facets = vars(YearBuilt.fact))
# Residualize log price on sqrt(Age), then check whether GarageYrBlt
# explains any of the leftover variance.
# NOTE(review): lm() drops NA rows, so this assignment only aligns row-wise
# if `log(SalePrice)` and `sqrt(Age)` are NA-free — confirm upstream.
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `sqrt(Age)`,
val_train_Xy
)$residuals
# Only houses whose garage year differs from the build year are informative.
df = filter(
val_train_Xy,
GarageYrBlt - YearBuilt != 0
)
ggplot(df, aes(x = GarageYrBlt, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = 'log(SalePrice) ~ YearBuilt Residuals')
print(
paste(
"Correlation of GarageYRBlt to Age residual:",
cor(df$GarageYrBlt, df$resids)
)
)
## [1] "Correlation of GarageYRBlt to Age residual: 0.0701692753019878"
# Near-zero residual correlation: drop GarageYrBlt and the derived flag.
val_train_Xy = select(val_train_Xy, -c('GarageYrBlt', 'GarBltLater'))
# --- GarageFinish: category summary and level significance vs log price ---
x = 'GarageFinish'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,  # FIX: FALSE instead of F (F is reassignable).
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "RFn" "Unf" "None" "Fin"
# --- GarageCars: ordered-factor view, level significance, correlations ---
x = 'GarageCars'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
val_train_Xy$GarageCars.fact = factor(
  val_train_Xy$GarageCars,
  ordered = TRUE  # FIX: TRUE instead of T (T is reassignable).
)
sum_and_trans_fact(data = val_train_Xy, x = 'GarageCars.fact', y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'GarageCars.fact', z = y, min_n = 30)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "2" "1" "3" "0"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## GarageCars
## Min. :0.005684
## 1st Qu.:0.162049
## Median :0.279956
## Mean :0.293729
## 3rd Qu.:0.408933
## Max. :0.870258
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = c(y))
## NULL
# GarageCars + 1 so log-family transformations are defined for 0-car houses.
val_train_Xy$GarageCars.plus1 = val_train_Xy$GarageCars + 1
x = 'GarageCars.plus1'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)
## NULL
# QQ plots and Shapiro-Wilk: does Winsorizing GarageCars help normality?
x = 'GarageCars'
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
val_train_Xy$`Win(GarageCars)` = Winsorize(
  x = val_train_Xy[[x]],
  probs = c(0, 0.999),
  na.rm = TRUE  # FIX: TRUE instead of T (T is reassignable).
)
qqnorm(y = val_train_Xy$`Win(GarageCars)`, ylab = 'Win_raw_x')
qqline(y = val_train_Xy$`Win(GarageCars)`, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy[[x]]))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy[[x]]
## W = 0.83694, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`Win(GarageCars)`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`Win(GarageCars)`
## W = 0.83446, p-value < 2.2e-16
# Binarize GarageCars: 0 = no garage, 1 = at least one car.
val_train_Xy = val_train_Xy %>%
  mutate(GarageCars.bin = ifelse(GarageCars == 0, 0, 1))
x = 'GarageCars.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$GarageCars.bin.fact =
  factor(val_train_Xy$GarageCars.bin, ordered = TRUE)  # FIX: TRUE, not T.
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'GarageCars.bin.fact', y = y)
## NULL
p_vals = get_signif_levels(
  data = val_train_Xy, x = 'GarageCars.bin.fact', z = y, min_n = 30
)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"
# Compare raw, Winsorized, and binary versions on correlations.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GarageCars', 'Win(GarageCars)', 'GarageCars.bin')
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## GarageCars Win(GarageCars) GarageCars.bin
## Min. :0.005684 Min. :0.005378 Min. :0.005967
## 1st Qu.:0.163144 1st Qu.:0.163484 1st Qu.:0.051718
## Median :0.282048 Median :0.283037 Median :0.090024
## Mean :0.306341 Mean :0.307958 Mean :0.118027
## 3rd Qu.:0.419101 3rd Qu.:0.421339 3rd Qu.:0.148698
## Max. :1.000000 Max. :0.999361 Max. :0.568588
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features')
That didn’t seem to help.
Polymodal in interaction with GarageCars.
# --- GarageArea: distribution and search for a normalizing transform ---
x = 'GarageArea'
summary(val_train_Xy[x])
## GarageArea
## Min. : 0.0
## 1st Qu.: 312.0
## Median : 480.0
## Mean : 469.3
## 3rd Qu.: 576.0
## Max. :1418.0
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 1
)
## NULL
# Materialize the square-root transform as its own column.
x = 'sqrt(GarageArea)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(GarageArea)' = sqrt(GarageArea))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## sqrt(GarageArea)
## Min. : 0.00
## 1st Qu.:17.66
## Median :21.91
## Mean :20.68
## 3rd Qu.:24.00
## Max. :37.66
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
# QQ plots and Shapiro-Wilk on garage-owning houses only (GarageArea != 0):
# raw vs sqrt vs Winsorized variants.
x = 'sqrt(GarageArea)'
df = filter(val_train_Xy, GarageArea != 0)
qqnorm(y = df$GarageArea, ylab = 'GarageArea')
qqline(y = df$GarageArea, ylab = 'GarageArea')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_sqrt_x = Winsorize(
  x = df[[x]],
  probs = c(0, 0.998),
  na.rm = TRUE  # FIX: TRUE instead of T (T is reassignable).
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
  x = df$GarageArea,
  probs = c(0, 0.987),
  na.rm = TRUE
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$GarageArea))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$GarageArea
## W = 0.97888, p-value = 1.218e-08
print(shapiro.test(x = val_train_Xy$`sqrt(GarageArea)`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`sqrt(GarageArea)`
## W = 0.85021, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_sqrt_x
## W = 0.98761, p-value = 1.749e-05
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.97029, p-value = 1.824e-10
# Apply the fences learned on garage-owning houses; keep true zeros at 0
# so "no garage" is preserved.
min_val = min(Win_sqrt_x)
max_val = max(Win_sqrt_x)
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(sqrt(GarageArea))' = ifelse(
      GarageArea == 0,
      0,
      Winsorize(
        sqrt(GarageArea),
        minval = min_val,
        maxval = max_val
      )
    )
  )
# Correlations of the GarageArea variants, with and without 0-area houses.
x = 'Win(sqrt(GarageArea))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GarageArea', 'sqrt(GarageArea)', x)
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## GarageArea sqrt(GarageArea) Win(sqrt(GarageArea))
## Min. :0.006474 Min. :0.001747 Min. :0.001705
## 1st Qu.:0.139205 1st Qu.:0.121113 1st Qu.:0.121213
## Median :0.324645 Median :0.268493 Median :0.264631
## Mean :0.317963 Mean :0.287737 Mean :0.286712
## 3rd Qu.:0.430672 3rd Qu.:0.375451 3rd Qu.:0.372716
## Max. :0.873194 Max. :0.875112 Max. :0.876250
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features')
# Same comparison excluding houses without garages.
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## GarageArea sqrt(GarageArea) Win(sqrt(GarageArea))
## Min. :0.002476 Min. :0.002494 Min. :0.002863
## 1st Qu.:0.128551 1st Qu.:0.144903 1st Qu.:0.145087
## Median :0.337145 Median :0.322871 Median :0.319261
## Mean :0.315598 Mean :0.315749 Mean :0.314570
## 3rd Qu.:0.456501 3rd Qu.:0.470921 3rd Qu.:0.470155
## Max. :0.819002 Max. :0.837390 Max. :0.840302
## NA's :1 NA's :1 NA's :1
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
Transforming and Winsorizing doesn’t help the overall correlation to SalePrice. Let’s facet it out and see.
# Subset to the three common garage types; drop the lone 4-car point.
df = filter(
  val_train_Xy,
  (GarageType %in% c('Attchd', 'Detchd', 'BuiltIn')) &
  (GarageCars != 4) # Only one point in this subset with 4 and it doesn't really change the correlations, but it saves viz space.
)
ggplot(df, aes(x = `Win(sqrt(GarageArea))`, y = `log(SalePrice)`)) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = 'lm') +
  facet_grid(rows = vars(GarageType), cols = vars(GarageCars.fact))
# Helper: n plus the correlation of Win(log(SalePrice)) to each GarageArea
# variant. Deduplicates three copy-pasted summarize() blocks; respects any
# grouping already applied to `data`.
area_cor_summary = function(data) {
  summarize(
    data,
    n = n(),
    GarageArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`GarageArea`
    ),
    sqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`sqrt(GarageArea)`
    ),
    WinsqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(sqrt(GarageArea))`
    )
  )
}
# Overall, by type, and by car count (cells with fewer than min_n dropped).
sum_df0 = area_cor_summary(df)
min_n = 30
# sum_df1 = df %>%
#   group_by(GarageCars.fact, GarageType) %>%
#   area_cor_summary() %>%
#   filter(n >= min_n)
sum_df2 = df %>% group_by(GarageType) %>%
  area_cor_summary() %>%
  filter(n >= min_n)
sum_df3 = df %>% group_by(GarageCars.fact) %>%
  area_cor_summary() %>%
  filter(n >= min_n)
sum_df0 %>%
  merge(y = sum_df2, all = TRUE) %>%  # FIX: TRUE instead of T.
  merge(y = sum_df3, all = TRUE) %>%
  # merge(y = sum_df1, all = TRUE) %>%
  select(
    c('GarageType', 'GarageCars.fact', 'n', 'GarageArea_cor',
      'sqrtArea_cor', 'WinsqrtArea_cor')
  ) %>%
  arrange(GarageType, GarageCars.fact)
Transforming and Winsorizing GarageArea improves correlation to the target variable (“Win(log(SalePrice))”) somewhat when excluding houses without garages. Grouping by garage type helps in some cases.
Whether to transform the variable or not may depend on which ML algorithm we’re using and how. A decision tree will likely be indifferent to transformations, though it may benefit from noise reduction with Winsorization. A linear regression will only benefit if type and/or missingness is factored in, as would KNN.
Grouping by number of cars lowers the area:price correlation to no correlation. And, in fact, number of cars has a stronger correlation to the target variable than area does. Let’s see how grouping by type further improves that correlation.
# Same subset as above, but keep 4-car houses; compare GarageCars to the
# Winsorized area variant as predictors of Win(log(SalePrice)).
df = filter(
val_train_Xy,
(GarageType %in% c('Attchd', 'Detchd', 'BuiltIn'))
)
ggplot(df, aes(x = GarageCars, y = `log(SalePrice)`)) +
geom_jitter(alpha = 0.5) +
geom_smooth(method = 'lm') +
facet_wrap(facets = vars(GarageType))
# Overall correlations (raw cars, Winsorized cars, Winsorized sqrt area).
sum_df0 = df %>%
summarize(
n = n(),
Cars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$GarageCars
),
WinCars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(GarageCars)`
),
WinsqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(sqrt(GarageArea))`
)
)
# Same correlations within each garage type.
sum_df2 = df %>% group_by(GarageType) %>%
summarize(
n = n(),
Cars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$GarageCars
),
WinCars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(GarageCars)`
),
WinsqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(sqrt(GarageArea))`
)
)
# Combined table: overall row (NA GarageType) plus per-type rows.
sum_df0 %>%
merge(y = sum_df2, all = T) %>%
select(
c('GarageType', 'n', 'Cars_cor', 'WinCars_cor', 'WinsqrtArea_cor')
) %>%
arrange(GarageType)
It looks worth dropping GarageArea in favor of GarageCars. There is an argument against linear regression using discrete variables, but it seems to work nonetheless.
Winsorizing GarageCars only produces a marginal benefit which may prove spurious as it only adjusts one point. Plus, it reduces normality. So, I’ll just use the raw feature.
There are likely other features at play, like year built, that also affect things like the difference in the prices of two-car attached garages and two-car detached garages.
# Boxplot+jitter of log price by garage type, colored by year-built bucket,
# faceted by car count (4-car houses excluded for readability).
fenced_jbv(
data = df[df$GarageCars != 4, ],
x = 'GarageType',
y = 'log(SalePrice)',
jit_col = 'YearBuilt.fact',
leg_lbl = 'Year Built'
) +
facet_wrap(facets = vars(GarageCars.fact))
# Keep raw GarageCars; drop all derived garage-car/area columns.
val_train_Xy = select(
val_train_Xy,
-c('GarageCars.fact', 'GarageCars.plus1', 'GarageCars.bin',
'GarageCars.bin.fact', 'GarageArea', 'sqrt(GarageArea)',
'Win(sqrt(GarageArea))')
)
# GarageQual/GarageCond are almost all 'TA' — too little variance to keep.
summary(val_train_Xy[ , c('GarageQual', 'GarageCond')])
## GarageQual GarageCond
## None: 40 None: 40
## Po : 1 Po : 4
## Fa : 29 Fa : 20
## TA :633 TA :646
## Gd : 11 Gd : 4
## Ex : 1 Ex : 1
val_train_Xy = select(val_train_Xy, -c('GarageQual', 'GarageCond'))
# PavedDrive is almost all 'Y' — drop it too.
summary(val_train_Xy$PavedDrive)
## None N P Y
## 0 53 19 643
val_train_Xy = select(val_train_Xy, -c('PavedDrive'))
# --- WoodDeckSF: distribution and transform search (decks only) ---
x = 'WoodDeckSF'
summary(val_train_Xy[x])
## WoodDeckSF
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 96.78
## 3rd Qu.:175.50
## Max. :736.00
# Zero-inflated: evaluate transforms on houses that actually have a deck.
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$WoodDeckSF != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = .1
)
## NULL
# Materialize the cube-root transform as its own column.
x = 'cbrt(WoodDeckSF)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(WoodDeckSF)' = (WoodDeckSF)^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## cbrt(WoodDeckSF)
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :2.712
## 3rd Qu.:5.599
## Max. :9.029
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
## NULL
# QQ plots and Shapiro-Wilk on deck-owning houses: raw vs cbrt vs Winsorized.
df = filter(val_train_Xy, WoodDeckSF != 0)
qqnorm(y = df$WoodDeckSF, ylab = 'WoodDeckSF')
qqline(y = df$WoodDeckSF, ylab = 'WoodDeckSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_cbrt_x = Winsorize(
  x = df[[x]],
  probs = c(0.01, 0.99),
  na.rm = TRUE  # FIX: TRUE instead of T (T is reassignable).
)
qqnorm(y = Win_cbrt_x, ylab = x)
qqline(y = Win_cbrt_x, ylab = x)
Win_raw_x = Winsorize(
  x = df$WoodDeckSF,
  probs = c(0, 0.95),
  na.rm = TRUE
)
qqnorm(y = Win_raw_x, ylab = 'WoodDeckSF')
qqline(y = Win_raw_x, ylab = 'WoodDeckSF')
print(shapiro.test(x = df$WoodDeckSF))
##
## Shapiro-Wilk normality test
##
## data: df$WoodDeckSF
## W = 0.88365, p-value = 1.951e-15
print(shapiro.test(x = df[[x]]))
##
## Shapiro-Wilk normality test
##
## data: df[[x]]
## W = 0.98712, p-value = 0.003914
print(shapiro.test(x = Win_cbrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_cbrt_x
## W = 0.98648, p-value = 0.002765
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.93225, p-value = 2.343e-11
# Apply the fences learned above, preserving true zeros as "no deck".
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(cbrt(WoodDeckSF))' = ifelse(
      WoodDeckSF == 0,
      0,
      Winsorize(
        WoodDeckSF^(1/3),
        minval = min(Win_cbrt_x),
        maxval = max(Win_cbrt_x)
      )
    )
  ) %>%
  mutate(
    'Win(WoodDeckSF)' = ifelse(
      WoodDeckSF == 0,
      0,
      Winsorize(
        WoodDeckSF,
        minval = min(Win_raw_x),
        maxval = max(Win_raw_x)
      )
    )
  )
# Binary has-deck indicator plus an ordered-factor version for tests.
val_train_Xy = val_train_Xy %>%
  mutate('WoodDeck.bin' = ifelse(WoodDeckSF == 0, 0, 1)) %>%
  mutate('WoodDeck.bin.fact' = factor(WoodDeck.bin, ordered = TRUE))
# --- WoodDeck.bin: significance and correlation comparison of variants ---
x = 'WoodDeck.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,  # FIX: FALSE instead of F (F is reassignable).
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('WoodDeckSF', 'cbrt(WoodDeckSF)', 'Win(cbrt(WoodDeckSF))',
          'Win(WoodDeckSF)', 'WoodDeck.bin')
x = 'Win(cbrt(WoodDeckSF))'
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## WoodDeckSF cbrt(WoodDeckSF) Win(cbrt(WoodDeckSF)) Win(WoodDeckSF)
## Min. :0.006145 Min. :0.004212 Min. :0.004404 Min. :0.001863
## 1st Qu.:0.084695 1st Qu.:0.069548 1st Qu.:0.069704 1st Qu.:0.082133
## Median :0.176322 Median :0.161833 Median :0.161822 Median :0.179367
## Mean :0.160453 Mean :0.165207 Mean :0.165111 Mean :0.163279
## 3rd Qu.:0.234533 3rd Qu.:0.252250 3rd Qu.:0.251680 3rd Qu.:0.242096
## Max. :0.328898 Max. :0.355420 Max. :0.355488 Max. :0.339011
## WoodDeck.bin
## Min. :0.002522
## 1st Qu.:0.055760
## Median :0.161167
## Mean :0.149999
## 3rd Qu.:0.228064
## Max. :0.330345
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features')
# Same correlation comparison, excluding houses without decks. WoodDeck.bin
# is constant in this subset, so its correlations are all NA.
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## WoodDeckSF cbrt(WoodDeckSF) Win(cbrt(WoodDeckSF)) Win(WoodDeckSF)
## Min. :0.001777 Min. :0.002974 Min. :0.004668 Min. :0.001231
## 1st Qu.:0.060621 1st Qu.:0.056354 1st Qu.:0.056724 1st Qu.:0.060845
## Median :0.112829 Median :0.144985 Median :0.142206 Median :0.122578
## Mean :0.114102 Mean :0.123596 Mean :0.124123 Mean :0.116930
## 3rd Qu.:0.157044 3rd Qu.:0.172657 3rd Qu.:0.169070 3rd Qu.:0.155611
## Max. :0.285351 Max. :0.298849 Max. :0.302455 Max. :0.304426
##
## WoodDeck.bin
## Min. : NA
## 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :55
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
  # plot_scat_pairs(
  #   df = filter(val_train_Xy, WoodDeckSF != 0),
  #   x = feat,
  #   y_lst = y_lst
  # )
}
There’s no real correlation between deck square footage and the target price when you remove houses without decks; the binary version does most of the work. But, the transformation does it a little better and does offer more normalized distance between points for KNN.
x = 'Win(cbrt(WoodDeckSF))'
# Report the Winsorization fences so they can be reused verbatim later
# (e.g. when transforming the test set).
min_val = min(Win_cbrt_x)
max_val = max(Win_cbrt_x)
print(paste("min_val:", min_val))
## [1] "min_val: 2.99287415882937"
print(paste("max_val:", max_val))
## [1] "max_val: 8.48252791006478"
# Already hard coded above
# Keep the Winsorized cbrt column; drop the binary/raw-Winsorized variants.
val_train_Xy = select(
val_train_Xy,
-c('WoodDeck.bin', 'WoodDeck.bin.fact', 'Win(WoodDeckSF)')
)
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = .25)
# --- OpenPorchSF: distribution and transform search (porches only) ---
x = 'OpenPorchSF'
summary(val_train_Xy[x])
## OpenPorchSF
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 27.00
## Mean : 47.46
## 3rd Qu.: 72.00
## Max. :547.00
# Zero-inflated: evaluate transforms on houses that actually have a porch.
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$OpenPorchSF != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = 0.1
)
## NULL
# Materialize the cube-root transform as its own column.
x = 'cbrt(OpenPorchSF)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(OpenPorchSF)' = OpenPorchSF^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## cbrt(OpenPorchSF)
## Min. :0.000
## 1st Qu.:0.000
## Median :3.000
## Mean :2.276
## 3rd Qu.:4.160
## Max. :8.178
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.1,
t_binw = 0.1
)
## NULL
# QQ plots and Shapiro-Wilk on porch-owning houses: raw vs cbrt vs Winsorized.
df = filter(val_train_Xy, OpenPorchSF != 0)
x = 'cbrt(OpenPorchSF)'
qqnorm(y = df$OpenPorchSF, ylab = 'OpenPorchSF')
qqline(y = df$OpenPorchSF, ylab = 'OpenPorchSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_cbrt_x = Winsorize(
  x = df[[x]],
  probs = c(0.001, 0.994),
  na.rm = TRUE  # FIX: TRUE instead of T (T is reassignable).
)
qqnorm(y = Win_cbrt_x, ylab = x)
qqline(y = Win_cbrt_x, ylab = x)
Win_raw_x = Winsorize(
  x = df$OpenPorchSF,
  probs = c(0, 0.95),
  na.rm = TRUE
)
qqnorm(y = Win_raw_x, ylab = 'OpenPorchSF')
qqline(y = Win_raw_x, ylab = 'OpenPorchSF')
print(shapiro.test(x = df$OpenPorchSF))
##
## Shapiro-Wilk normality test
##
## data: df$OpenPorchSF
## W = 0.79866, p-value < 2.2e-16
print(shapiro.test(x = df[[x]]))
##
## Shapiro-Wilk normality test
##
## data: df[[x]]
## W = 0.97116, p-value = 5.932e-07
print(shapiro.test(x = Win_cbrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_cbrt_x
## W = 0.97391, p-value = 1.921e-06
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.89182, p-value = 6.251e-16
# Apply the fences learned above, preserving true zeros as "no porch".
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(cbrt(OpenPorchSF))' = ifelse(
      OpenPorchSF == 0,
      0,
      Winsorize(
        OpenPorchSF^(1/3),
        # probs = c(0.001, 0.994),
        # na.rm = TRUE
        minval = min(Win_cbrt_x),
        maxval = max(Win_cbrt_x)
      )
    )
  )
Looks like some polymodality happening. Year built doesn’t seem to explain it. I’m not going to manually search for it any further, but a decision tree or other ML algorithm may find and “factor” in the hidden interaction.
# Polymodality check: does YearBuilt explain the porch-size clusters?
ggplot(
  filter(val_train_Xy, OpenPorchSF != 0),
  aes(x = `cbrt(OpenPorchSF)`)
) +
  geom_histogram(binwidth = 0.1) +
  facet_wrap(facets = vars(YearBuilt.fact), ncol = 1)
ggplot(
  filter(val_train_Xy, OpenPorchSF != 0),
  aes(x = `cbrt(OpenPorchSF)`, y = `log(SalePrice)`)
) +
  geom_point(aes(color = YearBuilt))
# Binary has-porch indicator plus an ordered-factor version for tests.
val_train_Xy = val_train_Xy %>%
  mutate('OpenPorch.bin' = ifelse(OpenPorchSF == 0, 0, 1)) %>%
  mutate('OpenPorch.bin.fact' = factor(OpenPorch.bin, ordered = TRUE))
x = 'OpenPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,  # FIX: TRUE/FALSE instead of T/F (T/F are reassignable).
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)
print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"
# Correlations of the OpenPorchSF variants, with and without 0-porch houses.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('OpenPorchSF', 'cbrt(OpenPorchSF)', 'Win(cbrt(OpenPorchSF))',
          'OpenPorch.bin')
x = 'Win(cbrt(OpenPorchSF))'
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## OpenPorchSF cbrt(OpenPorchSF) Win(cbrt(OpenPorchSF))
## Min. :0.004127 Min. :0.003166 Min. :0.002332
## 1st Qu.:0.059251 1st Qu.:0.090986 1st Qu.:0.091328
## Median :0.142723 Median :0.167835 Median :0.168485
## Mean :0.151759 Mean :0.202395 Mean :0.202442
## 3rd Qu.:0.232678 3rd Qu.:0.325809 3rd Qu.:0.326224
## Max. :0.339325 Max. :0.460438 Max. :0.459971
## OpenPorch.bin
## Min. :0.000127
## 1st Qu.:0.082138
## Median :0.152291
## Mean :0.203538
## 3rd Qu.:0.356361
## Max. :0.465546
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features')
# Same comparison excluding houses without porches (bin becomes constant).
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(df)  # FIX: df already holds absolute values; second abs() was redundant.
## OpenPorchSF cbrt(OpenPorchSF) Win(cbrt(OpenPorchSF)) OpenPorch.bin
## Min. :0.002912 Min. :0.0005588 Min. :0.001252 Min. : NA
## 1st Qu.:0.037026 1st Qu.:0.0371173 1st Qu.:0.033447 1st Qu.: NA
## Median :0.070241 Median :0.0705708 Median :0.067662 Median : NA
## Mean :0.074408 Mean :0.0750998 Mean :0.073149 Mean :NaN
## 3rd Qu.:0.106605 3rd Qu.:0.1177230 3rd Qu.:0.112676 3rd Qu.: NA
## Max. :0.192231 Max. :0.2019503 Max. :0.191297 Max. : NA
## NA's :57
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = TRUE) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
  plot_scat_pairs(
    df = val_train_Xy[val_train_Xy[[x]] != 0, ],
    x = feat,
    y_lst = y_lst
  )
}
In this case, the binary variable is doing all the work.
val_train_Xy = select(
val_train_Xy,
-c('cbrt(OpenPorchSF)', 'Win(cbrt(OpenPorchSF))', 'OpenPorch.bin.fact')
)
# EnclosedPorch: mostly zeros (median 0, mean ~19, max 294).
x = 'EnclosedPorch'
summary(val_train_Xy[x])
## EnclosedPorch
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 19.09
## 3rd Qu.: 0.00
## Max. :294.00
# Summarize/plot the best normalizing transform found earlier, using only the
# non-zero rows (zeros were excluded when the normalizers were fit).
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$EnclosedPorch != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 5,
t_binw = 5
)
## NULL
# Q-Q plots of the raw non-zero values...
df = filter(val_train_Xy, EnclosedPorch != 0)
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
# ...and of a very light Winsorization (0.1% / 99.9% tails), which barely
# clips anything at this sample size.
Win_raw_x = Winsorize(
x = df$EnclosedPorch,
probs = c(0.001, 0.999),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
# Shapiro-Wilk: neither version rejects normality (p = 0.26 / 0.24), so no
# transform is applied to the non-zero EnclosedPorch values.
print(shapiro.test(x = df[[x]]))
##
## Shapiro-Wilk normality test
##
## data: df[[x]]
## W = 0.98306, p-value = 0.2587
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.9827, p-value = 0.2437
# Add a 0/1 has-enclosed-porch flag plus an ordered-factor copy for the
# group-wise summaries and significance tests that follow.
val_train_Xy = val_train_Xy %>%
  mutate(
    'EnclosedPorch.bin' = ifelse(EnclosedPorch == 0, 0, 1),
    'EnclosedPorch.bin.fact' = factor(EnclosedPorch.bin, ordered = TRUE)
  )
# Compare SalePrice across the has/has-not enclosed porch groups.
x = 'EnclosedPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
# Pairwise level-vs-level significance tests (min_n = 30 presumably filters
# out small levels — confirm in tools/eda.R), shown as a p-value heatmap.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
# Both levels differ significantly from each other.
print(p_vals$signif_levels)
## [1] "0" "1"
# Refresh the numeric-feature list (new columns were added above), then get
# correlations of EnclosedPorch and its binary to all numeric features.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('EnclosedPorch', 'EnclosedPorch.bin')
x = 'EnclosedPorch'
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# df already holds absolute values; the second abs() inside summary() was
# redundant (harmless, but misleading).
summary(df)
## EnclosedPorch EnclosedPorch.bin
## Min. :0.004006 Min. :0.001413
## 1st Qu.:0.025521 1st Qu.:0.043374
## Median :0.069574 Median :0.095328
## Mean :0.087443 Mean :0.113655
## 3rd Qu.:0.138839 3rd Qu.:0.165249
## Max. :0.371542 Max. :0.458928
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = c(x),
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
## EnclosedPorch
## Min. :0.005257
## 1st Qu.:0.089806
## Median :0.169637
## Mean :0.184809
## 3rd Qu.:0.248067
## Max. :0.474650
## NA's :3
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
plot_scat_pairs(
df = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = feat,
y_lst = y_lst
)
}
Houses with enclosed porches are significantly cheaper than those without, maybe due to a confounding variable like year built. There is a weak, if existent, positive correlation between square footage and price within those that have them.
fenced_jbv(
data = val_train_Xy,
x = 'EnclosedPorch.bin.fact',
y = 'log(SalePrice)',
jit_col = 'YearBuilt',
jit_alpha = 1
) +
facet_wrap(facets = vars(YearBuilt.fact))
# Control for age: regress log price on sqrt(Age) and check whether
# EnclosedPorch explains any of the leftover variation.
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `sqrt(Age)`,
val_train_Xy
)$residuals
ggplot(val_train_Xy, aes(x = EnclosedPorch, y = resids)) +
geom_point() +
geom_smooth(method = 'lm')
# Keep raw EnclosedPorch only; drop the temporary binary columns.
val_train_Xy = select(
val_train_Xy, -c('EnclosedPorch.bin', 'EnclosedPorch.bin.fact')
)
summary(val_train_Xy$X3SsnPorch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 4.505 0.000 508.000
val_train_Xy = select(val_train_Xy, -c('X3SsnPorch'))
x = 'ScreenPorch'
summary(val_train_Xy[x])
## ScreenPorch
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 16.48
## 3rd Qu.: 0.00
## Max. :480.00
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$ScreenPorch != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = 0.1
)
## NULL
x = 'cbrt(ScreenPorch)'
# Add the cube-root transform; 0^(1/3) is 0, so zeros need no special casing.
val_train_Xy = val_train_Xy %>%
mutate('cbrt(ScreenPorch)' = ScreenPorch^(1/3))
# Recalculate best normalizers.
# (A new numeric column exists, so refresh the feature list and re-run the
# per-feature normalizer search; zeros are again excluded from the fits.)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## cbrt(ScreenPorch)
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4832
## 3rd Qu.:0.0000
## Max. :7.8297
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$`cbrt(ScreenPorch)` != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.1,
t_binw = 0.1
)
## NULL
x = 'cbrt(ScreenPorch)'
df = filter(val_train_Xy, ScreenPorch != 0)
qqnorm(y = df$ScreenPorch, ylab = 'ScreenPorch')
qqline(y = df$ScreenPorch)
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]])
Win_cbrt_x = Winsorize(
df[[x]],
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = 'Win_cbrt_x')
qqline(y = Win_cbrt_x)
Win_raw_x = Winsorize(
df$ScreenPorch,
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x)
print(shapiro.test(x = df$ScreenPorch))
##
## Shapiro-Wilk normality test
##
## data: df$ScreenPorch
## W = 0.92914, p-value = 0.001652
print(shapiro.test(x = df[[x]]))
##
## Shapiro-Wilk normality test
##
## data: df[[x]]
## W = 0.97514, p-value = 0.2487
print(shapiro.test(x = Win_cbrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_cbrt_x
## W = 0.96719, p-value = 0.1008
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.92848, p-value = 0.001547
# Winsorized cube-root of ScreenPorch (zeros stay zero), plus a 0/1
# has-screen-porch flag and its ordered-factor copy. The Winsorize bounds
# come from Win_cbrt_x, which was fit above on the non-zero rows only.
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(cbrt(ScreenPorch))' = ifelse(
      ScreenPorch == 0,
      0,
      Winsorize(
        ScreenPorch^(1/3),
        minval = min(Win_cbrt_x),
        maxval = max(Win_cbrt_x)
      )
    ),
    'ScreenPorch.bin' = ifelse(ScreenPorch == 0, 0, 1),
    'ScreenPorch.bin.fact' = factor(ScreenPorch.bin, ordered = TRUE)
  )
x = 'ScreenPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('ScreenPorch', 'cbrt(ScreenPorch)', 'Win(cbrt(ScreenPorch))',
'ScreenPorch.bin')
x = 'Win(cbrt(ScreenPorch))'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## ScreenPorch cbrt(ScreenPorch) Win(cbrt(ScreenPorch))
## Min. :0.000127 Min. :0.001702 Min. :0.0007076
## 1st Qu.:0.033205 1st Qu.:0.032575 1st Qu.:0.0310287
## Median :0.052721 Median :0.054455 Median :0.0552326
## Mean :0.062430 Mean :0.060759 Mean :0.0601016
## 3rd Qu.:0.074751 3rd Qu.:0.083482 3rd Qu.:0.0840852
## Max. :0.225009 Max. :0.226688 Max. :0.2256480
## ScreenPorch.bin
## Min. :0.001074
## 1st Qu.:0.026853
## Median :0.056082
## Mean :0.057789
## 3rd Qu.:0.086152
## Max. :0.220821
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
## ScreenPorch cbrt(ScreenPorch) Win(cbrt(ScreenPorch)) ScreenPorch.bin
## Min. :0.01673 Min. :0.003906 Min. :0.0138 Min. : NA
## 1st Qu.:0.09574 1st Qu.:0.106601 1st Qu.:0.1020 1st Qu.: NA
## Median :0.14405 Median :0.143076 Median :0.1361 Median : NA
## Mean :0.14621 Mean :0.159218 Mean :0.1533 Mean :NaN
## 3rd Qu.:0.20818 3rd Qu.:0.233239 3rd Qu.:0.2118 3rd Qu.: NA
## Max. :0.37341 Max. :0.325564 Max. :0.2796 Max. : NA
## NA's :1 NA's :1 NA's :1 NA's :57
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
plot_scat_pairs(
df = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = feat,
y_lst = y_lst
)
}
The binary doesn’t seem to make as significant a difference as other binaries like WoodDeck and OpenPorch did. So, it’s not worth using alone, but might be useful in interaction with the area in a linear regression. Decision trees should be able to do without the binary feature. The transformation may help KNN, but like a decision tree, I would want to avoid overweighting by including both.
fenced_jbv(
data = val_train_Xy,
x = 'ScreenPorch.bin.fact',
y = 'log(SalePrice)',
jit_col = 'YearBuilt',
jit_alpha = 1
) +
facet_wrap(facets = vars(YearBuilt.fact))
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `sqrt(Age)`,
val_train_Xy
)$residuals
ggplot(val_train_Xy, aes(x = `cbrt(ScreenPorch)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm')
print("Correlation of cbrt(ScreenPorch) to residuals of `log(SalePrice)` ~ `sqrt(Age)`")
## [1] "Correlation of cbrt(ScreenPorch) to residuals of `log(SalePrice)` ~ `sqrt(Age)`"
print(cor(x = val_train_Xy$`cbrt(ScreenPorch)`, y = val_train_Xy$resids))
## [1] 0.2266884
# Repeat the residual correlation, this time excluding houses with no
# screen porch.
df = filter(val_train_Xy, ScreenPorch != 0)
ggplot(df, aes(x = `cbrt(ScreenPorch)`, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm')
print("Exluding ScreenPorch 0s:")
## [1] "Exluding ScreenPorch 0s:"
# BUG FIX: this cor() previously used the full val_train_Xy instead of the
# filtered df, so it silently repeated the all-rows value — note the rendered
# output below is identical to the one above (0.2266884 both times). Using df
# actually excludes the zeros; re-knit to refresh the stale output line.
print(cor(x = df$`cbrt(ScreenPorch)`, y = df$resids))
## [1] 0.2266884
This feature doesn’t seem to have much to offer. But, I’ll leave it anyway and let feature selection during modeling suss that out.
val_train_Xy = select(
val_train_Xy, -c('ScreenPorch.bin.fact', 'Win(cbrt(ScreenPorch))')
)
summary(val_train_Xy$PoolQC)
## None Po Fa TA Gd Ex
## 712 0 1 0 1 1
val_train_Xy = select(val_train_Xy, -c('PoolArea', 'PoolQC'))
x = 'Fence'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "None" "MnPrv"
# 0/1 has-fence flag (the 'None' level means no fence) plus an ordered-factor
# copy for the group-wise significance checks below.
val_train_Xy = val_train_Xy %>%
  mutate(
    'Fence.bin' = ifelse(Fence == 'None', 0, 1),
    'Fence.bin.fact' = factor(Fence.bin, ordered = TRUE)
  )
x = 'Fence.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"
Having a Fence seems to detract value, probably due to an interaction with another variable as we saw with EnclosedPorch and age. Unlike EnclosedPorch, it’s not immediately obvious what the other variable is, and “controlling” for it with a linear regression may actually increase Fence’s significance. So, rather than hunting for the other variable(s) or dropping this one, I’ll leave it and see if ML modeling can make use of it.
val_train_Xy = select(
val_train_Xy,
-c('Fence.bin', 'Fence.bin.fact')
)
MiscVal is kind of a cheater variable. It should have precisely 1 for its coefficient. Otherwise, I would just drop it for having so few observations; it might just throw off the regression. If I keep it, it should be transformed in the same way that the target variable is.
It also looks like the presence of a miscellaneous improvement is associated with a lower price if anything.
x = 'MiscFeature'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
x = 'MiscVal'
summary(val_train_Xy[x])
## MiscVal
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 54.19
## 3rd Qu.: 0.00
## Max. :15500.00
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$MiscVal != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 200,
t_binw = .1
)
## NULL
x = 'log2(MiscVal)'
val_train_Xy = val_train_Xy %>%
mutate(
'log2(MiscVal)' = ifelse(
MiscVal == 0,
0,
log2(MiscVal)
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## log2(MiscVal)
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.2708
## 3rd Qu.: 0.0000
## Max. :13.9200
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$`log2(MiscVal)` != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
## NULL
Alright, I’ll give it a natural log like SalePrice, since it is a straight dollar value.
x = 'log(MiscVal)'
val_train_Xy = val_train_Xy %>%
mutate(
'log(MiscVal)' = ifelse(
MiscVal == 0,
0,
log(MiscVal)
)
)
df = filter(val_train_Xy, MiscVal != 0)
gg = ggplot(df, aes(x = `log(MiscVal)`))
p1 = gg + geom_histogram(binwidth = 0.1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
Since this variable is an actual dollar value, Winsorizing it doesn’t really make sense. I’ll check it out anyway.
qqnorm(y = df$MiscVal, ylab = 'MiscVal')
qqline(y = df$MiscVal)
qqnorm(y = df$`log(MiscVal)`, ylab = 'log(MiscVal)')
qqline(y = df$`log(MiscVal)`)
Win_log_x = Winsorize(
df$`log(MiscVal)`,
probs = c(0.007, 0.95),
na.rm = T
)
qqnorm(y = Win_log_x, ylab = 'Win_log_x')
qqline(y = Win_log_x)
Win_raw_x = Winsorize(
df$MiscVal,
probs = c(0, 0.95)
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x)
print(shapiro.test(x = df$MiscVal))
##
## Shapiro-Wilk normality test
##
## data: df$MiscVal
## W = 0.49238, p-value = 2.714e-07
print(shapiro.test(x = df$`log(MiscVal)`))
##
## Shapiro-Wilk normality test
##
## data: df$`log(MiscVal)`
## W = 0.88923, p-value = 0.02603
print(shapiro.test(x = Win_log_x))
##
## Shapiro-Wilk normality test
##
## data: Win_log_x
## W = 0.89412, p-value = 0.03203
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.5586, p-value = 1.129e-06
val_train_Xy = val_train_Xy %>%
mutate(
'Win(log(MiscVal))' = ifelse(
MiscVal == 0,
0,
Winsorize(
log(MiscVal),
minval = min(Win_log_x),
maxval = max(Win_log_x)
)
)
)
# 0/1 flag for having any miscellaneous-feature value, plus an ordered-factor
# copy for the group-wise significance checks below.
val_train_Xy = val_train_Xy %>%
  mutate(
    'MiscVal.bin' = ifelse(MiscVal == 0, 0, 1),
    'MiscVal.bin.fact' = factor(MiscVal.bin, ordered = TRUE)
  )
x = 'MiscVal.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('MiscVal', 'log2(MiscVal)', 'log(MiscVal)',
'Win(log(MiscVal))', 'MiscVal.bin')
x = 'log(MiscVal)'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## MiscVal log2(MiscVal) log(MiscVal) Win(log(MiscVal))
## Min. :0.001150 Min. :0.001803 Min. :0.001803 Min. :0.002117
## 1st Qu.:0.006437 1st Qu.:0.013649 1st Qu.:0.013649 1st Qu.:0.013673
## Median :0.013835 Median :0.028586 Median :0.028586 Median :0.028980
## Mean :0.018486 Mean :0.032804 Mean :0.032804 Mean :0.032855
## 3rd Qu.:0.025099 3rd Qu.:0.047086 3rd Qu.:0.047086 3rd Qu.:0.046462
## Max. :0.083097 Max. :0.087143 Max. :0.087143 Max. :0.086933
## MiscVal.bin
## Min. :0.00218
## 1st Qu.:0.01782
## Median :0.03383
## Mean :0.03774
## 3rd Qu.:0.05123
## Max. :0.09574
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
## MiscVal log2(MiscVal) log(MiscVal) Win(log(MiscVal))
## Min. :0.006626 Min. :0.009576 Min. :0.009576 Min. :0.001031
## 1st Qu.:0.050062 1st Qu.:0.121361 1st Qu.:0.121361 1st Qu.:0.114223
## Median :0.123861 Median :0.233112 Median :0.233112 Median :0.264792
## Mean :0.139666 Mean :0.254203 Mean :0.254203 Mean :0.270993
## 3rd Qu.:0.190636 3rd Qu.:0.369457 3rd Qu.:0.369457 3rd Qu.:0.415090
## Max. :0.458312 Max. :0.590246 Max. :0.590246 Max. :0.610388
##
## MiscVal.bin
## Min. : NA
## 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :58
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
plot_scat_pairs(
df = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = feat,
y_lst = y_lst
)
}
MiscVal appears to be correlated with the size of the lot and house. Perhaps that will do the heavy lifting and make MiscVal obsolete.
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `Win(LotArea)`,
val_train_Xy
)$residuals
print("Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `Win(LotArea)`")
## [1] "Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `Win(LotArea)`"
print(cor(x = val_train_Xy$`log(MiscVal)`, y = val_train_Xy$resids))
## [1] -0.08294709
ggplot(val_train_Xy, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `Win(LotArea)`")
df = filter(val_train_Xy, MiscVal != 0)
print("Exluding MiscVal 0s:")
## [1] "Exluding MiscVal 0s:"
print(cor(x = df$`log(MiscVal)`, y = df$resids))
## [1] 0.4032275
ggplot(df, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `Win(LotArea)`")
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `log10(log10(LotArea))`,
val_train_Xy
)$residuals
print("Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `log10(log10(LotArea))`")
## [1] "Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `log10(log10(LotArea))`"
print(cor(x = val_train_Xy$`log(MiscVal)`, y = val_train_Xy$resids))
## [1] -0.0788646
ggplot(val_train_Xy, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `log10(log10(LotArea))`")
df = filter(val_train_Xy, MiscVal != 0)
print("Exluding MiscVal 0s:")
## [1] "Exluding MiscVal 0s:"
print(cor(x = df$`log(MiscVal)`, y = df$resids))
## [1] 0.4945909
ggplot(df, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `log10(log10(LotArea))`")
After factoring in Lot Area, there’s still some correlation between MiscVal and the target once 0s are removed. It may still be worth keeping. We’ll let modeling suss that out.
val_train_Xy = select(
val_train_Xy, -c('MiscVal.bin', 'MiscVal.bin.fact', 'log2(MiscVal)')
)
MoSold somewhat normally distributed around June/July. Conventional wisdom says that summer sales are higher in volume and price, but the data don’t bear that out for price.
Records go from January 2006 through July 2010, so August-December are underrepresented by roughly 20%.
YrSold pretty uniform (about 140-160) except for 2010 which ended in July in this set.
Interesting spike in sales in Spring 2010. Foreclosures coming onto market?
val_train_Xy = val_train_Xy %>%
mutate('MoSold.fact' = factor(MoSold)) %>%
mutate('YrSold.fact' = factor(YrSold, ordered = T))
x = 'MoSold.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "7" "5" "9" "12"
x = 'YrSold.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
val_train_Xy = val_train_Xy %>%
mutate(
SoldDate = as.Date(
paste(
as.character(YrSold),
as.character(MoSold),
'15',
sep = '/'
),
format = '%Y/%m/%d'
)
)
ggplot(val_train_Xy, aes(x = SoldDate)) +
geom_bar()
x = 'SoldDate'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('SoldDate')
#
# df = get_cors(
# data = filter(
# select(val_train_Xy, all_of(num_feats)),
# !is.na(.data[[x]])
# ),
# x_lst = x_lst,
# feats = num_feats
# )
# df
# print("Summary of absolute values of Pearson's Rs:")
# df = abs(df)
# summary(abs(df))
#
# df = melt(df)
# ggplot(df, aes(x = variable, y = value)) +
# geom_boxplot(notch = T) +
# ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
ggplot(
val_train_Xy,
aes(x = SoldDate, y = `log(SalePrice)`)
) +
geom_point() +
geom_smooth() +
geom_smooth(method = 'lm', color = 'Yellow') +
facet_grid(rows = vars(SaleCondition), cols = vars(SaleType))
None of the date features seems to matter in this set. Drop them, but not yet.
x = 'SaleType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "WD" "New"
df = val_train_Xy[val_train_Xy$SaleType %in% c('WD', 'New', 'COD'), ]
gg = ggplot(df, aes(x = SaleType))
gg + geom_bar() +
facet_grid(rows = vars(MoSold), cols = vars(YrSold)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
gg + geom_bar(position = 'dodge', aes(fill = factor(YrSold)))
ggplot(df, aes(color = SaleType, x = SoldDate)) +
geom_freqpoly()
# ggplot(df, aes(x = SaleType, y = SalePrice)) +
# geom_col(position = 'dodge', aes(fill = factor(YrSold), stat = 'mean'))
ggplot(df, aes(x = SaleType, y = `log(SalePrice)`)) +
geom_boxplot(position = 'dodge', notch = T, aes(color = factor(YrSold)))
You can see new home sales drop off with the crash. Court officer deeds also ticked up, possibly due to more foreclosures. But, there didn’t seem to be a significant difference in sale prices year over year within sale type groups, except between new sales in 2007 and 2010, the latter of which is not fully represented in this set.
592 normal, 61 partial (home not completed when last assessed), 43 abnormal (trade, foreclosure, short sale), 11 family. I’m guessing a family sale will typically be lower in price, as will foreclosures and short sales. I’m not sure what to make of partials; the house wasn’t fully built when assessed, so the price may be askew.
Surprisingly, abnormal sales didn’t seem to vary with the crash. Ames appears to have fared well.
x = 'SaleCondition'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Normal" "Abnorml" "Partial"
df = filter(val_train_Xy, SaleCondition %in% c('Normal', 'Abnorml', 'Partial'))
gg = ggplot(df, aes(x = SaleCondition))
gg + geom_bar() +
facet_grid(rows = vars(MoSold), cols = vars(YrSold)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
gg + geom_bar(aes(fill = factor(YrSold)), position = 'dodge')
ggplot(df, aes(x = SoldDate, color = SaleCondition)) +
geom_freqpoly()
ggplot(df, aes(x = SaleCondition, y = `log(SalePrice)`)) +
geom_boxplot(position = 'dodge', notch = T, aes(color = factor(YrSold)))
Bearing in mind that 2010 is not a complete year in this set, partial sales dropped as normal sales increased. This trend may be explained by developers finishing/halting their projects. Filtering/grouping by year built and/or neighborhood might help check this, but I’ll skip it for the sake of finishing.
Fall and winter months seemed to be where the bulk of these increases in normal sales fell each year.
To wrap up this notebook, and as a preliminary gauge on how well I have prepared the data for ML, mainly for linear regression, I’ll check out how correlations have changed. Have my variables increased in correlation in general? Have they increased in correlation to the target variable?
Increased correlations to the target variable have obvious benefits. Increased correlations between predictor variables will help clarify which variables may overlap in their predictive power and redundantly overweight the same underlying information.
I made a lot of new features, dropping some along the way. In cases where a Winsorized version seemed a good option, I also kept the scale-transformed version where applicable. For example, I kept both log(SalePrice) and Win(log(SalePrice)). Winsorization will help a straightforward linear regression without interactions. But, Winsorization will undercut KNN’s ability to cluster multivariate outliers and similarly RF’s ability to group by extremes. That said, Winsorization will reduce the chances of overfit in all three. All that is to say that there are redundant new features.
I can’t think of a quick and dirty way to view this without getting skewed results or busy heatmaps, other than to make a table of all the variables’ correlations to the target variable, which isn’t very easily digested either. The slow and tedious way would be to iteratively and somewhat “manually” select like variables for correlation within their experimental group. I’m not going to do that.
# Numeric columns only, reordered so each raw feature sits next to its
# transformed/Winsorized variants for easier visual comparison in the
# correlation plots below.
val_train_Xy_numeric = select(
  val_train_Xy, # Reorder for easier comparison.
  c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))', "LotFrontage",
    "log10(LotFrontage)", "Win(log10(LotFrontage))", "Win(LotFrontage)",
    "LotArea", "log10(log10(LotArea))", "Win(LotArea)", "OverallQual_int",
    "OverallCond_int", "YearBuilt", "sqrt(Age)", "YearRemodAdd", "MasVnrArea",
    "cbrt(MasVnrArea)", "Win(cbrt(MasVnrArea))", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "cbrt(BsmtUnfSF)", "square(log(TotalBsmtSF))",
    "Win(square(log(TotalBsmtSF)))", "TotalBsmtSF", "TotalBsmtFinSF",
    "sqrt(TotalBsmtFinSF)", "Win(sqrt(TotalBsmtFinSF))", "X2ndFlrSF",
    "X2ndFlr.bin", "GrLivArea", "square(log2(GrLivArea))",
    "Win(square(log2(GrLivArea)))", "TotBaths", "Win(TotBaths)",
    "BedroomAbvGr", "Win(BedroomAbvGr)", "TotRmsAbvGrd", "Win(TotRmsAbvGrd)",
    "Fireplaces.bin", "GarageCars",
    "Win(GarageCars)", "WoodDeckSF", "cbrt(WoodDeckSF)",
    "Win(cbrt(WoodDeckSF))", "OpenPorchSF", "OpenPorch.bin", "EnclosedPorch",
    "ScreenPorch", "cbrt(ScreenPorch)", "ScreenPorch.bin", "MiscVal",
    "log(MiscVal)", "Win(log(MiscVal))", "MoSold", "YrSold")
)
# NOTE(review): "Win(BedroomAbvGr)" and "TotRmsAbvGrd" were each listed twice
# in the original vector; select() deduplicates silently, so removing the
# repeats changes nothing in the result.
ggcorr(val_train_Xy_numeric)
# Pairwise-complete correlations (so scattered NAs don't wipe whole rows),
# then isolate each predictor's correlation to the three target variants.
cor_mtx = cor(val_train_Xy_numeric, use = 'pairwise.complete.obs')
target_vars_vec = c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))')
cor_mtx_melted = melt(cor_mtx)
# Keep target-vs-predictor pairs only; drop target-vs-target cells.
sales_cor_mtx_melted = filter(
cor_mtx_melted,
Var1 %in% target_vars_vec & !(Var2 %in% target_vars_vec)
)
ggplot(sales_cor_mtx_melted, aes(x = Var1, y = Var2)) +
geom_tile(aes(fill = value))
# Wide table: one row per predictor, one column per target variant.
dcast(sales_cor_mtx_melted, formula = Var2 ~ Var1)
fenced_jbv(
data = sales_cor_mtx_melted,
x = 'Var1',
y = 'value',
jit_h = 0
)
I’ll write it to an RDS file so I can verify that my final engineering script duplicates this process. I’ll verify in the next notebook before I start modeling.
# Restore the Id column (set aside at the top of the notebook), persist the
# engineered set, and read it back as a quick round-trip sanity check.
val_train_Xy[['Id']] = id_col
saveRDS(object = val_train_Xy, file = 'data/eda_val_train_Xy.rds')
head(x = readRDS(file = 'data/eda_val_train_Xy.rds'))